from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pickle
import os
import plotly.express as px
import pandas as pd
import numpy as np
from scipy.special import softmax
import ibis
ibis.set_backend("duckdb")
ibis.options.interactive = True
from ibis import _
import ibis.selectors as s
import warnings
warnings.filterwarnings('ignore')
def construct_databases(base_loc):
    """Read every non-hidden entry of ``base_loc`` as an ibis table.

    Entries whose name starts with "." are skipped. Every other name returned
    by ``os.listdir(base_loc)`` is passed to ``ibis.read_parquet`` as
    ``base_loc + "/" + name``.

    Args:
        base_loc: Path of the directory whose entries hold the parquet data.

    Returns:
        dict mapping ``"t_" + name`` to the table returned by
        ``ibis.read_parquet`` for that entry.
    """
    visible_names = (name for name in os.listdir(base_loc) if not name.startswith("."))
    return {
        f"t_{name}": ibis.read_parquet(base_loc + "/" + name)
        for name in visible_names
    }
# Load parquet databases into module-level variables.
# construct_databases returns a dict keyed "t_<directory name>"; at module scope locals()
# is the module namespace, so update() binds each table to a variable of that name.
# NOTE(review): t_associationByOverallDirect, used further down, is presumably created
# here -- confirm against the contents of the data directory.
locals().update(construct_databases("../../../data/open_targets/"))
def construct_scatterplot(df, mapper, hover_name, color=None, hover_data=None, size=2, filter=None):
    """Show a 3-D scatter plot of ``df`` positioned by ``mapper.embedding_``.

    NOTE: ``df`` is modified in place -- its "x", "y" and "z" columns are
    (over)written with the three rows of ``mapper.embedding_.T``, so the
    embedding must unpack into exactly three coordinate rows.

    Args:
        df: Data frame to plot.
        mapper: Object exposing an ``embedding_`` array.
        hover_name: Forwarded to ``px.scatter_3d`` as ``hover_name``.
        color: Forwarded to ``px.scatter_3d`` as ``color``.
        hover_data: Forwarded to ``px.scatter_3d`` as ``hover_data``.
        size: Marker size applied to every trace.
        filter: Optional callable; when given, only ``df[filter(df)]`` is
            plotted (the coordinate columns are still written to all of ``df``).

    Returns:
        Whatever ``fig.show()`` returns.
    """
    coordinates = mapper.embedding_.T
    df["x"], df["y"], df["z"] = coordinates

    if filter is None:
        plotted = df
    else:
        plotted = df[filter(df)]

    fig = px.scatter_3d(
        plotted,
        x="x",
        y="y",
        z="z",
        color=color,
        hover_name=hover_name,
        hover_data=hover_data,
    )
    # Remove the figure margins and apply a uniform marker size.
    fig.update_layout(margin={"l": 0, "r": 0, "t": 0, "b": 0})
    fig.update_traces(marker={"size": size})
    return fig.show()
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    with open(path, "rb") as handle:
        # SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
        return pickle.load(handle)


# Pickled data frames and mapper objects stored under models/.
gene_df = _load_pickle("models/gene_df.sav")
disease_df = _load_pickle("models/disease_df.sav")
gene_desc_mapper = _load_pickle("models/gene_desc_mapper.sav")
gene_go_mapper = _load_pickle("models/gene_go_mapper.sav")
disease_expression_mapper = _load_pickle("models/disease_expression_mapper.sav")
disease_desc_mapper = _load_pickle("models/disease_desc_mapper.sav")
gene_nucleotide_mapper = _load_pickle("models/gene_nucleotide_mapper.sav")
gene_protein_mapper = _load_pickle("models/gene_protein_mapper.sav")
# Wrap the pickled data frames as ibis in-memory tables and rename their key columns
# (quick fixes to harmonize them with the column names used by the joins below).
gene_column_names = {"id": "targetId", "index": "geneIndex"}
disease_column_names = {"index": "diseaseIndex"}

t_gene_df = ibis.memtable(gene_df).relabel(gene_column_names)
t_disease_df = ibis.memtable(disease_df).relabel(disease_column_names)

# Preview the gene table.
t_gene_df
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓ ┃ targetId ┃ approvedName ┃ go_desc ┃ functionDescriptions ┃ location ┃ truncDesc ┃ nucleotide ┃ contig ┃ protein ┃ geneIndex ┃ x ┃ y ┃ z ┃ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩ │ string │ string │ string │ string │ string │ string │ string │ string │ string │ int64 │ float32 │ float32 │ float32 │ ├─────────────────┼─────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────┼────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼────────┼──────────────────────────────────────────────────────────────────────────────────┼───────────┼───────────┼──────────┼──────────┤ │ ENSG00000059588 │ TAR (HIV-1) RNA binding protein 1 │ tRNA (guanine) methyltransferase 
activity regulation of transcription by RNA po… │ Probable S-adenosyl-L-methionine-dependent methyltransferase which methylates R… │ Nuclear speckles │ Probable S-adenosyl-L-methionine-dependent methylt │ ATGGAGTGGGTGCTCGCGGAAGCGCTGCTCTCGCAGAGCCGGGACCCCCGGGCCCTGCTTGGGGCGCTGTGCCAAGGGG… │ 1 │ MEWVLAEALLSQSRDPRALLGALCQGEASAERVETLRFLLQRLEDEEARGSGGAGALPEAAREVAAGYLVPLLRSLRGR… │ 0 │ 10.431715 │ 4.211885 │ 9.016659 │ │ ENSG00000072071 │ adhesion G protein-coupled receptor L1 │ latrotoxin receptor activity G protein-coupled receptor signaling pathway plasm… │ Calcium-independent receptor of high affinity for alpha- latrotoxin, an excitat… │ Cell membrane │ Calcium-independent receptor of high affinity for │ TCTTTTTTTTTTTTTTTCCTAATTTTTGGTCGGCGGCGGTGCTGGGCCAGGGGAAGGAAGGGACACGGAGGCCGCCCTC… │ 19 │ SFFFFFLIFGRRRCWARGRKGHGGRPRPATSYPLPPSPGSGRCAGRGARVRRAAGETRWADPREALDRLVVQAVVPAR*… │ 1 │ 9.038093 │ 6.293142 │ 4.004947 │ │ ENSG00000073536 │ notchless homolog 1 │ skeletal system morphogenesis positive regulation of canonical Wnt signaling pa… │ Plays a role in regulating Notch activity. Plays a role in regulating the expre… │ Nucleus │ Plays a role in regulating Notch activity. Plays a │ GGACGCAGGATGGCGGCAGCAGTGCCGGTGGGTGTGCGTGGATGGGGGCGGGGGGCGTCGCCGCGGGGCGCTAGGGCCC… │ 17 │ GRRMAAAVPVGVRGWGRGASPRGARARVPEA*ERRSGALLGGRPD*LPPHLLAQDEAVARDVQRLLVQFQDEGGQLLGS… │ 2 │ 8.673260 │ 5.322880 │ 6.614601 │ │ ENSG00000075290 │ Wnt family member 8B │ canonical Wnt signaling pathway signal transduction nervous system development … │ Ligand for members of the frizzled family of seven transmembrane receptors. 
May… │ Secreted │ Ligand for members of the frizzled family of seven │ CGCTTACACACCAAGGAAGTTGGGCTTTGAGAATTCCATCCCACTGGCACTGAGGAGAATATTTCTCCGTCTTGCTTAC… │ 10 │ RLHTKEVGL*EFHPTGTEENISPSCLPISQFFGIFSSCYSRGLCFFQSLLCTSVFSPVSSNSATAGR*TIS**LVQRLT… │ 3 │ 7.238286 │ 5.980227 │ 6.065460 │ │ ENSG00000083454 │ purinergic receptor P2X 5 │ positive regulation of calcium ion transport into cytosol positive regulation o… │ Receptor for ATP that acts as a ligand-gated ion channel. │ Membrane │ Receptor for ATP that acts as a ligand-gated ion c │ CGGGCGCCGGGCGCGCAGGGACCGAGGGACCGAGTGCTCCCCATGAGCGCACGTGGGCCGGGCGGTCCGCAAGCCCGGC… │ 17 │ RAPGAQGPRDRVLPMSARGPGGPQARLRARHGAGGLQGALPVAVRLQDREVCHRQEQEGGPAVPAAAGLHPGVPGRMGV… │ 4 │ 9.326583 │ 6.167222 │ 3.889343 │ │ ENSG00000083782 │ epiphycan │ articular cartilage development glycosaminoglycan binding bone development extr… │ May have a role in bone formation and also in establishing the ordered structur… │ Secreted │ May have a role in bone formation and also in esta │ ACAGCCATTGGTCAGGGGCAAATACCACTAGCTCTGCATCCTCAGTCACTTTGTGCCATTTCATCAGGTCAGAGCCAAA… │ 12 │ TAIGQGQIPLALHPQSLCAISSGQSQRKA*KMKTLAGLVLGLVIFDAAVTAPTLESINYDSETYDATLEDLDNLYNYEN… │ 5 │ 9.566043 │ 3.741266 │ 9.382458 │ │ ENSG00000087087 │ serrate, RNA effector molecule │ primary miRNA processing regulation of DNA-templated transcription DNA binding … │ Acts as a mediator between the cap-binding complex (CBC) and the primary microR… │ Nucleus │ Acts as a mediator between the cap-binding complex │ GTGCCTCGGAGGCGTGGGTGACGCAGGCGCAGCGCGGGCTGCGCGCGCTACTGCCCATCCCCGGTTGTCCCACTTTTGT… │ 7 │ VPRRRG*RRRSAGCARYCPSPVVPLLFASLRPSTQELRLRLALLEVLVARPRPRSP*NLARPSASPTAAAAPRPPQTVP… │ 6 │ 6.632429 │ 6.814382 │ 5.678242 │ │ ENSG00000087502 │ ERGIC and golgi 2 │ retrograde vesicle-mediated transport, Golgi to endoplasmic reticulum transport… │ Possible role in transport between endoplasmic reticulum and Golgi. . 
│ Endoplasmic reticulum │ Possible role in transport between endoplasmic ret │ TCTGTGAAACATGGCGGTAGGCTGGGACCATAACACAAGCATGACTATATGAAGGAAGAGGAAGGTTTTCCTGAAGATG… │ 12 │ SVKHGGRLGP*HKHDYMKEEEGFPEDEATESEKNFKFGKRVGCLSEGS*ELCRDFSQWRYSFSNSIYNYGFINHNGILS… │ 7 │ 7.251029 │ 1.868455 │ 9.219758 │ │ ENSG00000092201 │ SPT16 homolog, facilitates chromatin remodeling subunit │ nucleoplasm nucleoplasm FACT complex RNA binding nucleoplasm nucleoplasm transc… │ Component of the FACT complex, a general chromatin factor that acts to reorgani… │ Nucleus │ Component of the FACT complex, a general chromatin │ GGCAGACCGTCACGTGACGACGTCGATTCGCGTGCGGCAGTGGCGAAGTTGACAAACCCCGCGAAAATCGACTCTTTGC… │ 14 │ GRPSRDDVDSRAAVAKLTNPAKIDSLHRTFC*FSLVFLSLFPPSIRKRVGKKQNKQTNKKKT*RCWDPEAERASLRSIL… │ 8 │ 10.341167 │ 4.429931 │ 9.074904 │ │ ENSG00000102078 │ solute carrier family 25 member 14 │ plasma membrane mitochondrial inner membrane mitochondrial inner membrane mitoc… │ Participates in the mitochondrial proton leak measured in brain mitochondria. │ Mitochondrion inner membrane │ Participates in the mitochondrial proton leak meas │ GTTGGTTTCAATGCTTCCGGGTTGGCGCTGCAGTGGCGTTTCCGACTGTGGGAGCCTCAGCTTCCCAGTCGTCCGATGA… │ Other │ VGFNASGLALQWRFRLWEPQLPSRPMSPSS*VPSLSFTLLASVVLLLRLNPASSTPLGGRLLQAPPFSPRARSDS*GTG… │ 9 │ 9.445682 │ 3.893537 │ 8.485611 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────────────┴─────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────┴────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴────────┴──────────────────────────────────────────────────────────────────────────────────┴───────────┴───────────┴──────────┴──────────┘
# Preview the disease table (ibis.options.interactive is enabled at the top of the file,
# so evaluating the expression as the last statement of a notebook cell renders its rows).
t_disease_df
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓ ┃ name ┃ diseaseId ┃ desc ┃ most_expressed_in_system ┃ musculoskeletal_system ┃ integumental_system ┃ circulatory_system ┃ renal_system ┃ connective_tissue ┃ hematopoietic_system ┃ hemolymphoid_system ┃ digestive_system ┃ respiratory_system ┃ external_soft_tissue_zone ┃ nervous_system ┃ immune_system ┃ anatomical_junction ┃ endocrine_system ┃ anatomical_wall ┃ reproductive_system ┃ diseaseIndex ┃ x ┃ y ┃ z ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩ │ string │ string │ string │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ int64 │ float32 │ float32 │ float32 │ 
├──────────────────────────────────────────────────────┼─────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────┼────────────────────────┼─────────────────────┼────────────────────┼──────────────┼───────────────────┼──────────────────────┼─────────────────────┼──────────────────┼────────────────────┼───────────────────────────┼────────────────┼───────────────┼─────────────────────┼──────────────────┼─────────────────┼─────────────────────┼──────────────┼──────────┼──────────┼──────────┤ │ gonorrhea │ DOID_7551 │ A primary bacterial infectious disease that is a sexually transmitted infection… │ connective_tissue │ -0.096844 │ -0.124075 │ -0.269214 │ -0.183644 │ 0.082471 │ -0.241006 │ -0.171879 │ -0.138369 │ -0.388618 │ -0.267145 │ -0.324140 │ -0.419989 │ -0.315517 │ -0.397776 │ -0.165230 │ -0.310319 │ 0 │ 9.960172 │ 7.799116 │ 3.600109 │ │ respiratory quotient │ EFO_0005189 │ The respiratory quotient (or RQ or respiratory coefficient), is a dimensionless… │ musculoskeletal_system │ 2.000000 │ -0.400000 │ -0.400000 │ -0.666667 │ 0.000000 │ 0.300000 │ 0.000000 │ -0.352941 │ -0.500000 │ 0.000000 │ -0.800000 │ 0.437500 │ -1.000000 │ -0.333333 │ 0.000000 │ 0.083333 │ 1 │ 9.965736 │ 7.900253 │ 5.888873 │ │ response to silica exposure │ EFO_0005853 │ short or long term physiological response of an organism, eg in terms of deposi… │ connective_tissue │ -0.500000 │ 0.266667 │ -0.733333 │ -0.222222 │ 0.666667 │ 0.266667 │ 0.571429 │ 0.058824 │ -0.138889 │ -0.333333 │ -0.750000 │ 0.479167 │ -0.666667 │ -0.185185 │ 0.333333 │ -0.416667 │ 2 │ 9.978691 │ 6.141375 │ 0.409376 │ │ response to thiopurine │ EFO_0006317 │ Any process that results in a change in state or activity of a cell or an organ… │ hemolymphoid_system │ 0.250000 │ 0.250000 │ -0.426389 │ 0.083333 │ -0.125000 │ -0.112500 │ 0.482143 │ 0.209099 │ 0.125000 │ -0.375000 │ -0.402778 │ 0.078125 │ -0.875000 │ -0.588542 │ -0.375000 │ -0.741667 │ 3 │ 9.299581 
│ 6.741986 │ 1.319081 │ │ cryptococcosis │ EFO_0007229 │ An opportunistic mycosis that results_in fungal infection and has_material_basi… │ hemolymphoid_system │ -0.274286 │ -0.309714 │ -0.405206 │ -0.312381 │ -0.468571 │ -0.230857 │ -0.031837 │ -0.155399 │ -0.220000 │ -0.451429 │ -0.514404 │ -0.271786 │ -0.537143 │ -0.509762 │ -0.091429 │ -0.504450 │ 4 │ 8.008451 │ 6.225818 │ 1.558063 │ │ Nematoda infectious disease │ EFO_0007391 │ Infections caused by nematode larvae which never develop into the adult stage a… │ anatomical_wall │ -0.345930 │ -0.247674 │ -0.374354 │ -0.184109 │ -0.255814 │ -0.322674 │ -0.178571 │ -0.090544 │ -0.357558 │ -0.325581 │ -0.578775 │ -0.430596 │ -0.430233 │ -0.452116 │ 0.005814 │ -0.487350 │ 5 │ 7.896668 │ 6.960781 │ 2.706551 │ │ interleukin 1 Receptor accessory protein measurement │ EFO_0008167 │ quantification of the amount of interleukin 1 Receptor accessory protein in a s… │ hematopoietic_system │ -0.750000 │ -0.500000 │ -0.800000 │ -0.500000 │ -0.500000 │ 0.300000 │ -0.357143 │ -0.676471 │ -0.750000 │ -1.000000 │ -0.925000 │ -0.593750 │ -1.000000 │ -0.555556 │ -1.000000 │ -0.666667 │ 6 │ 8.840138 │ 4.281627 │ 1.280424 │ │ interleukin 23 receptor measurement │ EFO_0008181 │ quantification of the amount of interleukin 23 receptor in a sample │ hemolymphoid_system │ -0.500000 │ -0.650000 │ -0.944444 │ -0.666667 │ -0.500000 │ -0.325000 │ -0.035714 │ -0.133272 │ -0.562500 │ -1.000000 │ -0.805556 │ -0.312500 │ -1.000000 │ -0.315972 │ -0.750000 │ -0.354167 │ 7 │ 8.678880 │ 4.615093 │ 1.183100 │ │ atypical femoral fracture │ EFO_0009960 │ Stress or insufficency fractures occurring in the femoral shaft, typically in r… │ hemolymphoid_system │ -0.333333 │ -0.133333 │ -0.633333 │ -0.444444 │ -0.666667 │ 0.133333 │ 0.238095 │ -0.607843 │ -0.750000 │ -0.333333 │ -0.816667 │ 0.125000 │ -0.666667 │ -0.666667 │ 0.000000 │ -0.750000 │ 8 │ 8.702263 │ 5.653661 │ 0.854118 │ │ CD40 measurement │ EFO_0010586 │ quantification of the amount of CD40 in a 
sample │ digestive_system │ -0.750000 │ 0.500000 │ -0.400000 │ 0.333333 │ -0.500000 │ -0.600000 │ 0.214286 │ 0.705882 │ 0.458333 │ 0.500000 │ -0.875000 │ -0.468750 │ -0.500000 │ -0.111111 │ 0.000000 │ -0.083333 │ 9 │ 7.339328 │ 6.733737 │ 3.136575 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └──────────────────────────────────────────────────────┴─────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────┴────────────────────────┴─────────────────────┴────────────────────┴──────────────┴───────────────────┴──────────────────────┴─────────────────────┴──────────────────┴────────────────────┴───────────────────────────┴────────────────┴───────────────┴─────────────────────┴──────────────────┴─────────────────┴─────────────────────┴──────────────┴──────────┴──────────┴──────────┘
# Attach the integer disease/gene indices to each direct association, keep only
# associations backed by more than one piece of evidence, then tag and shuffle the rows.
# NOTE(review): ibis.random() is unseeded, so the train/test assignment presumably
# changes every time t_samples is executed -- confirm if reproducibility matters.
disease_index_lookup = t_disease_df.select("diseaseId", "diseaseIndex")
gene_index_lookup = t_gene_df.select("targetId", "geneIndex")

t_samples = (
    t_associationByOverallDirect
    .inner_join(disease_index_lookup, "diseaseId")
    .inner_join(gene_index_lookup, "targetId")
    .filter(_.evidenceCount > 1)
    .mutate(
        is_train=ibis.random() >= 0.2,  # 80% will be for training, the other 20% for testing
        random_order=ibis.random(),  # Used to shuffle the data
    )
    .order_by(_.random_order)
)

# Preview the sample table.
t_samples
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┃ diseaseId ┃ targetId ┃ score ┃ evidenceCount ┃ diseaseIndex ┃ geneIndex ┃ is_train ┃ random_order ┃ ┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━┩ │ string │ string │ float64 │ int64 │ int64 │ int64 │ boolean │ float64 │ ├───────────────┼─────────────────┼──────────┼───────────────┼──────────────┼───────────┼──────────┼──────────────┤ │ EFO_0004340 │ ENSG00000185513 │ 0.223393 │ 5 │ 6052 │ 4212 │ True │ 8.789357e-07 │ │ MONDO_0019457 │ ENSG00000116478 │ 0.046197 │ 2 │ 2153 │ 9811 │ False │ 1.127995e-05 │ │ Orphanet_1997 │ ENSG00000125046 │ 0.040210 │ 2 │ 5811 │ 4434 │ True │ 1.136772e-05 │ │ HP_0002373 │ ENSG00000123607 │ 0.086279 │ 2 │ 10767 │ 9561 │ True │ 1.171301e-05 │ │ EFO_0005547 │ ENSG00000026025 │ 0.044606 │ 4 │ 428 │ 14422 │ True │ 2.124696e-05 │ │ HP_0012758 │ ENSG00000132535 │ 0.426218 │ 4 │ 6210 │ 7932 │ True │ 2.341042e-05 │ │ EFO_0004514 │ ENSG00000010270 │ 0.363329 │ 9 │ 9578 │ 13902 │ False │ 2.359785e-05 │ │ EFO_0000708 │ ENSG00000171195 │ 0.268628 │ 5 │ 3693 │ 5873 │ False │ 2.487563e-05 │ │ MONDO_0100330 │ ENSG00000135318 │ 0.004065 │ 2 │ 7145 │ 6197 │ True │ 2.563954e-05 │ │ EFO_0001054 │ ENSG00000185338 │ 0.025761 │ 3 │ 3398 │ 15839 │ True │ 2.632733e-05 │ │ … │ … │ … │ … │ … │ … │ … │ … │ └───────────────┴─────────────────┴──────────┴───────────────┴──────────────┴───────────┴──────────┴──────────────┘
def extract_embedding(mapper, index):
    """Return the entry of ``mapper.embedding_`` stored at position ``index``."""
    embedding = mapper.embedding_
    return embedding[index]
def create_feature(gene_index, disease_index, gene_mappers, disease_mappers):
    """Build the feature vector for one (gene, disease) pair.

    The embedding of ``disease_index`` from every disease mapper, followed by
    the embedding of ``gene_index`` from every gene mapper, are concatenated
    (in mapper order) and passed through ``softmax``.

    Args:
        gene_index: Position of the gene in each gene mapper's embedding.
        disease_index: Position of the disease in each disease mapper's embedding.
        gene_mappers: Iterable of mappers exposing ``embedding_``.
        disease_mappers: Iterable of mappers exposing ``embedding_``.

    Returns:
        The softmax of the concatenated embedding values.
    """
    # Disease embeddings come first, then gene embeddings.
    embedding_rows = [extract_embedding(mapper, disease_index) for mapper in disease_mappers]
    embedding_rows += [extract_embedding(mapper, gene_index) for mapper in gene_mappers]
    concatenated = [value for row in embedding_rows for value in row]

    # Using softmax
    # (approximating a distribution over gene/disease features)
    return softmax(concatenated)
def extract_features(sample_entries, gene_mappers, disease_mappers):
    """Turn sample entries into a feature matrix and a target vector.

    Args:
        sample_entries: Iterable of mappings providing "geneIndex",
            "diseaseIndex" and "score".
        gene_mappers: Gene mappers forwarded to ``create_feature``.
        disease_mappers: Disease mappers forwarded to ``create_feature``.

    Returns:
        Tuple ``(features, targets)`` of numpy arrays: one feature row per
        entry (NaN values replaced by 0) and the matching "score" values.
    """
    rows = []
    targets = []
    for entry in sample_entries:
        rows.append(
            create_feature(entry["geneIndex"], entry["diseaseIndex"], gene_mappers, disease_mappers)
        )
        targets.append(entry["score"])

    features = np.asarray(rows)
    # Replace NaN feature values with 0 before handing the matrix to the model.
    features[np.isnan(features)] = 0
    return features, np.asarray(targets)
# Mappers whose embeddings are concatenated into each feature vector.
gene_mappers = [
    gene_desc_mapper,
    gene_go_mapper,
    gene_nucleotide_mapper,
    gene_protein_mapper,
]
disease_mappers = [disease_desc_mapper, disease_expression_mapper]

# Accumulators for the sample rows of each split.
train_entries = []
test_entries = []
# Pyarrow record batch readers are useful for iterating over large datasets.
# It's a bit overkill in this example, but the same pattern also suits online
# learning algorithms (learning iteratively over small batches).
def to_chunk_iterator(table, chunk_size=100):
    """Yield the rows of ``table`` in chunks, each chunk as a list of dicts.

    Args:
        table: Object exposing ``to_pyarrow_batches(chunk_size=...)``.
        chunk_size: Forwarded to ``to_pyarrow_batches``.

    Yields:
        ``batch.to_pylist()`` for every batch produced by the reader, until
        the reader signals exhaustion by raising ``StopIteration``.
    """
    reader = table.to_pyarrow_batches(chunk_size=chunk_size)
    while True:
        try:
            batch = reader.read_next_batch()
        except StopIteration:
            # The reader is exhausted; end the generator.
            return
        yield batch.to_pylist()
# Route every sample row to its split. is_train is the boolean result of the
# `ibis.random() >= 0.2` comparison computed in t_samples.
for chunk in to_chunk_iterator(t_samples, chunk_size=1000):
    for entry in chunk:
        destination = train_entries if entry["is_train"] else test_entries
        destination.append(entry)

# In this demo, extracting features and keeping them all in memory
# Not always a good idea depending upon how large they are
train_X, train_Y = extract_features(train_entries, gene_mappers, disease_mappers)
test_X, test_Y = extract_features(test_entries, gene_mappers, disease_mappers)
# LightGBM regressor with a cross-entropy objective, preceded by feature standardization.
model = LGBMRegressor(
    objective="cross_entropy",
    num_iterations=40,
    learning_rate=0.05,
)
pipeline_steps = [
    ("scalar", StandardScaler()),
    ("model", model),
]
pipe = Pipeline(pipeline_steps)

# Fit the scaler and the model on the training split.
pipe.fit(train_X, train_Y)
Pipeline(steps=[('scalar', StandardScaler()),
('model',
LGBMRegressor(learning_rate=0.05, num_iterations=40,
objective='cross_entropy'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('scalar', StandardScaler()),
('model',
LGBMRegressor(learning_rate=0.05, num_iterations=40,
objective='cross_entropy'))])StandardScaler()
LGBMRegressor(learning_rate=0.05, num_iterations=40, objective='cross_entropy')
# Quick measures for comparison (based on test predictions)
test_predictions = pipe.predict(test_X)  # predict once and reuse for every metric
squared_errors = (test_predictions - test_Y) ** 2
mse = squared_errors.mean()
rss = squared_errors.sum()  # residual sum of squares
tss = ((test_Y - test_Y.mean()) ** 2).sum()  # total sum of squares
# R-squared is 1 - RSS/TSS. The bare ratio RSS/TSS (what was reported before this fix)
# is the *unexplained* fraction of the variance, so a lower ratio means a better fit.
r2 = 1 - rss / tss
print("MSE: {}\nR-squared: {}".format(mse, r2))
MSE: 0.016242092156562287 R-squared: 0.8318774619447576
# In this example, predicting associated targets given Alzheimer's Disease
alz_index = t_disease_df.filter(_.diseaseId == "MONDO_0004975").diseaseIndex.first().execute()

# One feature vector per gene index, each paired with the Alzheimer's disease index.
n_genes = t_gene_df.count().execute()
X = np.asarray([
    create_feature(gene_index=gene_index, disease_index=alz_index,
                   disease_mappers=disease_mappers, gene_mappers=gene_mappers)
    for gene_index in range(n_genes)
])
# Replace NaNs with 0 exactly as extract_features does for the training matrix, so the
# model sees inference features prepared the same way as the ones it was fitted on.
X[np.isnan(X)] = 0

scores = pipe.predict(X)  # we could normalize this to a true distribution, but it wouldn't affect rankings
# NOTE(review): assumes row i of gene_df corresponds to gene index i -- confirm.
gene_df["predictedScore"] = scores

# We are going to visualize the results by coloring our gene description embedding plot with them
construct_scatterplot(gene_df, gene_desc_mapper, hover_name="approvedName",
                      color="predictedScore", hover_data=["location", "id"], size=2.5)
# Recompute the Alzheimer's Disease predictions, then plot only the top 1% of genes.
# The former `.select(...).mutate(score=0)` steps did not affect the looked-up index
# and have been dropped.
alz_index = t_disease_df.filter(_.diseaseId == "MONDO_0004975").diseaseIndex.first().execute()

# One feature vector per gene index, each paired with the Alzheimer's disease index.
n_genes = t_gene_df.count().execute()
X = np.asarray([
    create_feature(gene_index=gene_index, disease_index=alz_index,
                   disease_mappers=disease_mappers, gene_mappers=gene_mappers)
    for gene_index in range(n_genes)
])
# Replace NaNs with 0 exactly as extract_features does for the training matrix, so the
# model sees inference features prepared the same way as the ones it was fitted on.
X[np.isnan(X)] = 0

scores = pipe.predict(X)
# NOTE(review): assumes row i of gene_df corresponds to gene index i -- confirm.
gene_df["predictedScore"] = scores

# Keep only genes whose predicted score lies above the 99th percentile.
construct_scatterplot(gene_df, gene_desc_mapper, hover_name="approvedName",
                      color="predictedScore", hover_data=["location", "id"], size=2.5,
                      filter=lambda x: x["predictedScore"] > x["predictedScore"].quantile(0.99))
# Rank every gene by its predicted score and show the 60 highest.
nt = ibis.memtable(gene_df)
top_predicted = nt.order_by(ibis.desc(_.predictedScore)).head(60).execute()
top_predicted[["predictedScore", "id", "approvedName", "functionDescriptions"]]
| predictedScore | id | approvedName | functionDescriptions | |
|---|---|---|---|---|
| 0 | 0.290234 | ENSG00000022355 | gamma-aminobutyric acid type A receptor subuni... | Ligand-gated chloride channel which is a compo... |
| 1 | 0.288247 | ENSG00000113327 | gamma-aminobutyric acid type A receptor subuni... | Ligand-gated chloride channel which is a compo... |
| 2 | 0.288247 | ENSG00000145864 | gamma-aminobutyric acid type A receptor subuni... | Ligand-gated chloride channel which is a compo... |
| 3 | 0.273529 | ENSG00000111886 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 4 | 0.271424 | ENSG00000268089 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 5 | 0.271424 | ENSG00000011677 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 6 | 0.271424 | ENSG00000146276 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 7 | 0.271424 | ENSG00000145863 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 8 | 0.271424 | ENSG00000183185 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 9 | 0.271424 | ENSG00000163288 | gamma-aminobutyric acid type A receptor subuni... | Component of the heteropentameric receptor for... |
| 10 | 0.271424 | ENSG00000186297 | gamma-aminobutyric acid type A receptor subuni... | Ligand-gated chloride channel subunit which is... |
| 11 | 0.271424 | ENSG00000102287 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 12 | 0.271424 | ENSG00000182256 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 13 | 0.271424 | ENSG00000151834 | gamma-aminobutyric acid type A receptor subuni... | Ligand-gated chloride channel which is a compo... |
| 14 | 0.271424 | ENSG00000109158 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 15 | 0.271424 | ENSG00000163285 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 16 | 0.271424 | ENSG00000094755 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 17 | 0.270600 | ENSG00000187730 | gamma-aminobutyric acid type A receptor subuni... | GABA, the major inhibitory neurotransmitter in... |
| 18 | 0.260201 | ENSG00000166206 | gamma-aminobutyric acid type A receptor subuni... | Ligand-gated chloride channel which is a compo... |
| 19 | 0.231567 | ENSG00000178084 | 5-hydroxytryptamine receptor 3C | This is one of the several different receptors... |
| 20 | 0.229744 | ENSG00000186090 | 5-hydroxytryptamine receptor 3D | This is one of the several different receptors... |
| 21 | 0.229744 | ENSG00000186038 | 5-hydroxytryptamine receptor 3E | This is one of the several different receptors... |
| 22 | 0.229433 | ENSG00000166736 | 5-hydroxytryptamine receptor 3A | This is one of the several different receptors... |
| 23 | 0.227939 | ENSG00000186919 | zinc activated ion channel | Zinc-activated ligand-gated ion channel. . |
| 24 | 0.227847 | ENSG00000181752 | olfactory receptor family 8 subfamily K member 5 | Odorant receptor. . |
| 25 | 0.227847 | ENSG00000280090 | olfactory receptor family 8 subfamily B member 4 | Odorant receptor. . |
| 26 | 0.226560 | ENSG00000089041 | purinergic receptor P2X 7 | Receptor for ATP that acts as a ligand-gated i... |
| 27 | 0.224052 | ENSG00000149305 | 5-hydroxytryptamine receptor 3B | This is one of the several different receptors... |
| 28 | 0.223978 | ENSG00000108405 | purinergic receptor P2X 1 | Ligand-gated ion channel with relatively high ... |
| 29 | 0.223754 | ENSG00000176884 | glutamate ionotropic receptor NMDA type subunit 1 | Component of NMDA receptor complexes that func... |
| 30 | 0.221723 | ENSG00000083454 | purinergic receptor P2X 5 | Receptor for ATP that acts as a ligand-gated i... |
| 31 | 0.221490 | ENSG00000099957 | purinergic receptor P2X 6 | Receptor for ATP that acts as a ligand-gated i... |
| 32 | 0.213317 | ENSG00000273079 | glutamate ionotropic receptor NMDA type subuni... | Component of NMDA receptor complexes that func... |
| 33 | 0.213285 | ENSG00000183454 | glutamate ionotropic receptor NMDA type subuni... | Component of NMDA receptor complexes that func... |
| 34 | 0.209648 | ENSG00000166862 | calcium voltage-gated channel auxiliary subuni... | Regulates the trafficking and gating propertie... |
| 35 | 0.208525 | ENSG00000105464 | glutamate ionotropic receptor NMDA type subuni... | Component of NMDA receptor complexes that func... |
| 36 | 0.208393 | ENSG00000116032 | glutamate ionotropic receptor NMDA type subuni... | NMDA receptor subtype of glutamate-gated ion c... |
| 37 | 0.202177 | ENSG00000143252 | succinate dehydrogenase complex subunit C | Membrane-anchoring subunit of succinate dehydr... |
| 38 | 0.200488 | ENSG00000198785 | glutamate ionotropic receptor NMDA type subuni... | NMDA receptor subtype of glutamate-gated ion c... |
| 39 | 0.199298 | ENSG00000136521 | NADH:ubiquinone oxidoreductase subunit B5 | Accessory subunit of the mitochondrial membran... |
| 40 | 0.198087 | ENSG00000006116 | calcium voltage-gated channel auxiliary subuni... | Regulates the trafficking to the somatodendrit... |
| 41 | 0.197729 | ENSG00000151366 | NADH:ubiquinone oxidoreductase subunit C2 | Accessory subunit of the mitochondrial membran... |
| 42 | 0.197039 | ENSG00000075461 | calcium voltage-gated channel auxiliary subuni... | Regulates the activity of L-type calcium chann... |
| 43 | 0.194069 | ENSG00000166136 | NADH:ubiquinone oxidoreductase subunit B8 | Accessory subunit of the mitochondrial membran... |
| 44 | 0.188456 | ENSG00000142408 | calcium voltage-gated channel auxiliary subuni... | Regulates the activity of L-type calcium chann... |
| 45 | 0.188368 | ENSG00000169432 | sodium voltage-gated channel alpha subunit 9 | Mediates the voltage-dependent sodium ion perm... |
| 46 | 0.187995 | ENSG00000267855 | NADH:ubiquinone oxidoreductase subunit A7 | Accessory subunit of the mitochondrial membran... |
| 47 | 0.184663 | ENSG00000139180 | NADH:ubiquinone oxidoreductase subunit A9 | Accessory subunit of the mitochondrial membran... |
| 48 | 0.184144 | ENSG00000181273 | olfactory receptor family 5 subfamily AK member 2 | Odorant receptor. . |
| 49 | 0.184144 | ENSG00000150261 | olfactory receptor family 8 subfamily K member 1 | Odorant receptor. . |
| 50 | 0.184144 | ENSG00000196119 | olfactory receptor family 8 subfamily A member 1 | Odorant receptor. . |
| 51 | 0.184144 | ENSG00000181767 | olfactory receptor family 8 subfamily H member 2 | Odorant receptor. . |
| 52 | 0.184144 | ENSG00000280314 | olfactory receptor family 8 subfamily K member... | Odorant receptor. . |
| 53 | 0.184144 | ENSG00000181371 | olfactory receptor family 5 subfamily M member 8 | Odorant receptor. . |
| 54 | 0.184144 | ENSG00000186119 | olfactory receptor family 5 subfamily D member 18 | Odorant receptor. . |
| 55 | 0.184144 | ENSG00000181693 | olfactory receptor family 8 subfamily H member 1 | Odorant receptor. . |
| 56 | 0.184144 | ENSG00000279395 | olfactory receptor family 5 subfamily L member 1 | Odorant receptor. . |
| 57 | 0.184144 | ENSG00000167825 | olfactory receptor family 5 subfamily I member 1 | Odorant receptor. . |
| 58 | 0.184144 | ENSG00000181761 | olfactory receptor family 8 subfamily H member 3 | Odorant receptor. . |
| 59 | 0.184144 | ENSG00000196578 | olfactory receptor family 5 subfamily AC member 2 | Odorant receptor. . |
# Take the 40 highest-scoring direct associations for Alzheimer's Disease and attach
# the model's predicted score for each of those targets.
known_top = (
    t_associationByOverallDirect
    .filter(_.diseaseId == "MONDO_0004975")
    .order_by(ibis.desc(_.score))
    .head(40)
)
known_vs_predicted = (
    known_top
    .inner_join(nt, _.targetId == nt.id)
    # A join does not guarantee row order, so sort by score again after joining.
    .order_by(ibis.desc(_.score))
    .execute()
)
known_vs_predicted[["score", "predictedScore", "id", "approvedName", "functionDescriptions"]]
| score | predictedScore | id | approvedName | functionDescriptions | |
|---|---|---|---|---|---|
| 0 | 0.824413 | 0.091761 | ENSG00000142192 | amyloid beta precursor protein | Functions as a cell surface receptor and perfo... |
| 1 | 0.634681 | 0.087795 | ENSG00000087085 | acetylcholinesterase (Cartwright blood group) | Hydrolyzes rapidly the acetylcholine neurotran... |
| 2 | 0.625412 | 0.213285 | ENSG00000183454 | glutamate ionotropic receptor NMDA type subuni... | Component of NMDA receptor complexes that func... |
| 3 | 0.623134 | 0.223754 | ENSG00000176884 | glutamate ionotropic receptor NMDA type subunit 1 | Component of NMDA receptor complexes that func... |
| 4 | 0.621476 | 0.094793 | ENSG00000114200 | butyrylcholinesterase | Esterase with broad substrate specificity. Con... |
| 5 | 0.613484 | 0.213317 | ENSG00000273079 | glutamate ionotropic receptor NMDA type subuni... | Component of NMDA receptor complexes that func... |
| 6 | 0.609383 | 0.208393 | ENSG00000116032 | glutamate ionotropic receptor NMDA type subuni... | NMDA receptor subtype of glutamate-gated ion c... |
| 7 | 0.609168 | 0.090811 | ENSG00000137642 | sortilin related receptor 1 | Sorting receptor that directs several proteins... |
| 8 | 0.607102 | 0.083939 | ENSG00000164885 | cyclin dependent kinase 5 | Proline-directed serine/threonine-protein kina... |
| 9 | 0.604982 | 0.200488 | ENSG00000198785 | glutamate ionotropic receptor NMDA type subuni... | NMDA receptor subtype of glutamate-gated ion c... |
| 10 | 0.602390 | 0.208525 | ENSG00000105464 | glutamate ionotropic receptor NMDA type subuni... | Component of NMDA receptor complexes that func... |
| 11 | 0.601014 | 0.174569 | ENSG00000161509 | glutamate ionotropic receptor NMDA type subuni... | Component of NMDA receptor complexes that func... |
| 12 | 0.587812 | 0.083939 | ENSG00000176749 | cyclin dependent kinase 5 regulatory subunit 1 | p35 is a neuron specific activator of CDK5. Th... |
| 13 | 0.586498 | 0.089855 | ENSG00000080815 | presenilin 1 | Catalytic subunit of the gamma-secretase compl... |
| 14 | 0.583305 | 0.104351 | ENSG00000064687 | ATP binding cassette subfamily A member 7 | Catalyzes the translocation of specific phosph... |
| 15 | 0.565717 | 0.095840 | ENSG00000130203 | apolipoprotein E | APOE is an apolipoprotein, a protein associati... |
| 16 | 0.561265 | 0.091391 | ENSG00000118689 | forkhead box O3 | Transcriptional activator that recognizes and ... |
| 17 | 0.554858 | 0.101208 | ENSG00000073756 | prostaglandin-endoperoxide synthase 2 | Dual cyclooxygenase and peroxidase in the bios... |
| 18 | 0.526961 | 0.109847 | ENSG00000095303 | prostaglandin-endoperoxide synthase 1 | Dual cyclooxygenase and peroxidase in the bios... |
| 19 | 0.517934 | 0.095558 | ENSG00000136717 | bridging integrator 1 | Is a key player in the control of plasma membr... |
| 20 | 0.514720 | 0.108816 | ENSG00000113161 | 3-hydroxy-3-methylglutaryl-CoA reductase | Catalyzes the conversion of (3S)-hydroxy-3-met... |
| 21 | 0.497396 | 0.093533 | ENSG00000198087 | CD2 associated protein | Seems to act as an adapter protein between mem... |
| 22 | 0.493573 | 0.073091 | ENSG00000203710 | complement C3b/C4b receptor 1 (Knops blood group) | Membrane immune adherence receptor that plays ... |
| 23 | 0.490347 | 0.165674 | ENSG00000144285 | sodium voltage-gated channel alpha subunit 1 | Mediates the voltage-dependent sodium ion perm... |
| 24 | 0.485417 | 0.088477 | ENSG00000138613 | aph-1 homolog B, gamma-secretase subunit | Probable subunit of the gamma-secretase comple... |
| 25 | 0.483862 | 0.177092 | ENSG00000196876 | sodium voltage-gated channel alpha subunit 8 | Mediates the voltage-dependent sodium ion perm... |
| 26 | 0.482134 | 0.099628 | ENSG00000114026 | 8-oxoguanine DNA glycosylase | DNA repair enzyme that incises DNA at 8-oxoG r... |
| 27 | 0.480560 | 0.103413 | ENSG00000142319 | solute carrier family 6 member 3 | Mediates sodium- and chloride-dependent transp... |
| 28 | 0.474942 | 0.119958 | ENSG00000204681 | gamma-aminobutyric acid type B receptor subunit 1 | Component of a heterodimeric G-protein coupled... |
| 29 | 0.473011 | 0.111097 | ENSG00000136928 | gamma-aminobutyric acid type B receptor subunit 2 | Component of a heterodimeric G-protein coupled... |
| 30 | 0.471640 | 0.169157 | ENSG00000136531 | sodium voltage-gated channel alpha subunit 2 | Mediates the voltage-dependent sodium ion perm... |
| 31 | 0.470778 | 0.124441 | ENSG00000153253 | sodium voltage-gated channel alpha subunit 3 | Mediates the voltage-dependent sodium ion perm... |
| 32 | 0.470188 | 0.149977 | ENSG00000168356 | sodium voltage-gated channel alpha subunit 11 | This protein mediates the voltage-dependent so... |
| 33 | 0.470188 | 0.162964 | ENSG00000183873 | sodium voltage-gated channel alpha subunit 5 | This protein mediates the voltage-dependent so... |
| 34 | 0.470188 | 0.129318 | ENSG00000185313 | sodium voltage-gated channel alpha subunit 10 | Tetrodotoxin-resistant channel that mediates t... |
| 35 | 0.470188 | 0.142912 | ENSG00000136546 | sodium voltage-gated channel alpha subunit 7 | Mediates the voltage-dependent sodium ion perm... |
| 36 | 0.470188 | 0.173257 | ENSG00000007314 | sodium voltage-gated channel alpha subunit 4 | Pore-forming subunit of a voltage-gated sodium... |
| 37 | 0.470188 | 0.188368 | ENSG00000169432 | sodium voltage-gated channel alpha subunit 9 | Mediates the voltage-dependent sodium ion perm... |
| 38 | 0.463312 | 0.083460 | ENSG00000120885 | clusterin | [Isoform 1]: Functions as extracellular chaper... |
| 39 | 0.463101 | 0.107922 | ENSG00000103546 | solute carrier family 6 member 2 | Mediates sodium- and chloride-dependent transp... |